{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### **批量拆分**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "C:\\Users\\yuhongfei\\word-pdf\\易方达中小盘混合型证券投资基金2020年中期报告【拆分】\\易方达中小盘混合型证券投资基金2020年中期报告.pdf1.pdf\n",
      "C:\\Users\\yuhongfei\\word-pdf\\易方达中小盘混合型证券投资基金2020年中期报告【拆分】\\易方达中小盘混合型证券投资基金2020年中期报告.pdf2.pdf\n",
      "C:\\Users\\yuhongfei\\word-pdf\\易方达中小盘混合型证券投资基金2020年中期报告【拆分】\\易方达中小盘混合型证券投资基金2020年中期报告.pdf3.pdf\n",
      "C:\\Users\\yuhongfei\\word-pdf\\易方达中小盘混合型证券投资基金2020年中期报告【拆分】\\易方达中小盘混合型证券投资基金2020年中期报告.pdf4.pdf\n",
      "C:\\Users\\yuhongfei\\word-pdf\\易方达中小盘混合型证券投资基金2020年中期报告【拆分】\\易方达中小盘混合型证券投资基金2020年中期报告.pdf5.pdf\n",
      "C:\\Users\\yuhongfei\\word-pdf\\易方达中小盘混合型证券投资基金2020年中期报告【拆分】\\易方达中小盘混合型证券投资基金2020年中期报告.pdf6.pdf\n",
      "C:\\Users\\yuhongfei\\word-pdf\\易方达中小盘混合型证券投资基金2020年中期报告【拆分】\\易方达中小盘混合型证券投资基金2020年中期报告.pdf7.pdf\n",
      "C:\\Users\\yuhongfei\\word-pdf\\易方达中小盘混合型证券投资基金2020年中期报告【拆分】\\易方达中小盘混合型证券投资基金2020年中期报告.pdf8.pdf\n",
      "C:\\Users\\yuhongfei\\word-pdf\\易方达中小盘混合型证券投资基金2020年中期报告【拆分】\\易方达中小盘混合型证券投资基金2020年中期报告.pdf9.pdf\n",
      "C:\\Users\\yuhongfei\\word-pdf\\易方达中小盘混合型证券投资基金2020年中期报告【拆分】\\易方达中小盘混合型证券投资基金2020年中期报告.pdf10.pdf\n",
      "文件已成功拆分，保存路径为：C:\\Users\\yuhongfei\\word-pdf\\易方达中小盘混合型证券投资基金2020年中期报告【拆分】\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "from PyPDF2 import PdfFileWriter, PdfFileReader\n",
    "\n",
    "def split_pdf(filename, filepath, save_dirpath, step=5):\n",
    "    \"\"\"\n",
    "    拆分PDF为多个小的PDF文件，\n",
    "    @param filename:文件名\n",
    "    @param filepath:文件路径\n",
    "    @param save_dirpath:保存小的PDF的文件路径\n",
    "    @param step: 每step间隔的页面生成一个文件，例如step=5，表示0-4页、5-9页...为一个文件\n",
    "    @return:\n",
    "    \"\"\"\n",
    "    if not os.path.exists(save_dirpath):\n",
    "        os.mkdir(save_dirpath)\n",
    "    pdf_reader = PdfFileReader(filepath)\n",
    "    # 读取每一页的数据\n",
    "    pages = pdf_reader.getNumPages()\n",
    "    for page in range(0, pages, step):\n",
    "        pdf_writer = PdfFileWriter()\n",
    "        # 拆分pdf，每 step 页的拆分为一个文件\n",
    "        for index in range(page, page+step):\n",
    "            if index < pages:\n",
    "                pdf_writer.addPage(pdf_reader.getPage(index))\n",
    "        # 保存拆分后的小文件\n",
    "        save_path = os.path.join(save_dirpath, filename+str(int(page/step)+1)+'.pdf')\n",
    "        print(save_path)\n",
    "        with open(save_path, \"wb\") as out:\n",
    "            pdf_writer.write(out)\n",
    "\n",
    "    print(\"文件已成功拆分，保存路径为：\"+save_dirpath)\n",
    "    \n",
    "filename = '易方达中小盘混合型证券投资基金2020年中期报告.pdf'\n",
    "filepath = os.path.join(os.getcwd(), filename)\n",
    "save_dirpath = os.path.join(os.getcwd(), '易方达中小盘混合型证券投资基金2020年中期报告【拆分】')\n",
    "split_pdf(filename, filepath, save_dirpath, step=5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### **批量合并**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from PyPDF2 import PdfFileReader, PdfFileWriter\n",
    "\n",
    "def concat_pdf(filename, read_dirpath, save_filepath):\n",
    "    \"\"\"\n",
    "    合并多个PDF文件\n",
    "    @param filename:文件名\n",
    "    @param read_dirpath:要合并的PDF目录\n",
    "    @param save_filepath:合并后的PDF文件路径\n",
    "    @return:\n",
    "    \"\"\"\n",
    "    pdf_writer = PdfFileWriter()\n",
    "    # 对文件名进行排序\n",
    "    list_filename = os.listdir(read_dirpath)\n",
    "    list_filename.sort(key=lambda x: int(x[:-4].replace(filename, \"\")))\n",
    "    for filename in list_filename:\n",
    "        print(filename)\n",
    "        filepath = os.path.join(read_dirpath, filename)\n",
    "        # 读取文件并获取文件的页数\n",
    "        pdf_reader = PdfFileReader(filepath)\n",
    "        pages = pdf_reader.getNumPages()\n",
    "        # 逐页添加\n",
    "        for page in range(pages):\n",
    "            pdf_writer.addPage(pdf_reader.getPage(page))\n",
    "    # 保存合并后的文件\n",
    "    with open(save_filepath, \"wb\") as out:\n",
    "        pdf_writer.write(out)\n",
    "    print(\"文件已成功合并，保存路径为：\"+save_filepath)\n",
    "\n",
    "filename = '易方达中小盘混合型证券投资基金2020年中期报告.pdf'\n",
    "read_dirpath = os.path.join(os.getcwd(), '易方达中小盘混合型证券投资基金2020年中期报告【拆分】')\n",
    "save_filepath = os.path.join(os.getcwd(), '易方达中小盘混合型证券投资基金2020年中期报告-合并后.pdf')\n",
    "concat_pdf(filename, read_dirpath, save_filepath)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### **提取文字内容**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pdfplumber\n",
    "\n",
    "def extract_text_info(filepath):\n",
    "    \"\"\"\n",
    "    提取PDF中的文字\n",
    "    @param filepath:文件路径\n",
    "    @return:\n",
    "    \"\"\"\n",
    "    with pdfplumber.open(filepath) as pdf:\n",
    "        # 获取第2页数据\n",
    "        page = pdf.pages[1]\n",
    "        print(page.extract_text())\n",
    "        \n",
    "filename = '易方达中小盘混合型证券投资基金2020年中期报告.pdf'\n",
    "filepath = os.path.join(os.getcwd(), filename)\n",
    "# 提取文字内容\n",
    "extract_text_info(filepath)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### **提取表格内容**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "import pdfplumber\n",
    "\n",
    "def extract_table_info(filepath):\n",
    "    \"\"\"\n",
    "    提取PDF中的图表数据\n",
    "    @param filepath:\n",
    "    @return:\n",
    "    \"\"\"\n",
    "    with pdfplumber.open(filepath) as pdf:\n",
    "        # 获取第18页数据\n",
    "        page = pdf.pages[17]\n",
    "        # 如果一页有一个表格，设置表格的第一行为表头，其余为数据\n",
    "        table_info = page.extract_table()\n",
    "        df_table = pd.DataFrame(table_info[1:], columns=table_info[0])\n",
    "        df_table.to_csv('dmeo.csv', index=False, encoding='gbk')\n",
    "        \n",
    "filename = '易方达中小盘混合型证券投资基金2020年中期报告.pdf'\n",
    "filepath = os.path.join(os.getcwd(), filename)\n",
    "# 提取表格内容\n",
    "extract_table_info(filepath)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "import pdfplumber\n",
    "\n",
    "def extract_table_info(filepath):\n",
    "    \"\"\"\n",
    "    提取PDF中的图表数据\n",
    "    @param filepath:\n",
    "    @return:\n",
    "    \"\"\"\n",
    "    with pdfplumber.open(filepath) as pdf:\n",
    "        # 获取第7页数据\n",
    "        page = pdf.pages[6]\n",
    "        # 如果一页有多个表格，对应的数据是一个三维数组\n",
    "        tables_info = page.extract_tables()\n",
    "        for index in range(len(tables_info)):\n",
    "            # 设置表格的第一行为表头，其余为数据\n",
    "            df_table = pd.DataFrame(tables_info[index][1:], columns=tables_info[index][0])\n",
    "            df_table.to_csv('dmeo.csv', index=False, encoding='gbk')\n",
    "        \n",
    "filename = '易方达中小盘混合型证券投资基金2020年中期报告.pdf'\n",
    "filepath = os.path.join(os.getcwd(), filename)\n",
    "# 提取表格内容\n",
    "extract_table_info(filepath)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### **提取pdf中的图片**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "文件名：C:\\Users\\yuhongfei\\word-pdf\\易方达中小盘混合型证券投资基金2020年中期报告.pdf, 页数: 46, 对象: 15284\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Deprecation: 'writePNG' removed from class 'Pixmap' after v1.19 - use 'save'.\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import re\n",
    "import fitz\n",
    "\n",
    "def extract_pic_info(filepath, pic_dirpath):\n",
    "    \"\"\"\n",
    "    提取PDF中的图片\n",
    "    @param filepath:pdf文件路径\n",
    "    @param pic_dirpath:要保存的图片目录路径\n",
    "    @return:\n",
    "    \"\"\"\n",
    "    if not os.path.exists(pic_dirpath):\n",
    "        os.makedirs(pic_dirpath)\n",
    "    # 使用正则表达式来查找图片\n",
    "    check_XObject = r\"/Type(?= */XObject)\"\n",
    "    check_Image = r\"/Subtype(?= */Image)\"\n",
    "    img_count = 0\n",
    "\n",
    "    \"\"\"1. 打开pdf，打印相关信息\"\"\"\n",
    "    pdf_info = fitz.open(filepath)\n",
    "    # 1.16.8版本用法 xref_len = doc._getXrefLength()\n",
    "    # 最新版本\n",
    "    xref_len = pdf_info.xref_length()\n",
    "    # 打印PDF的信息\n",
    "    print(\"文件名：{}, 页数: {}, 对象: {}\".format(filepath, len(pdf_info), xref_len-1))\n",
    "\n",
    "    \"\"\"2. 遍历PDF中的对象，遇到是图像才进行下一步，不然就continue\"\"\"\n",
    "    for index in range(1, xref_len):\n",
    "        # 1.16.8版本用法 text = doc._getXrefString(index)\n",
    "        # 最新版本\n",
    "        text = pdf_info.xref_object(index)\n",
    "\n",
    "        is_XObject = re.search(check_XObject, text)\n",
    "        is_Image = re.search(check_Image, text)\n",
    "        # 如果不是对象也不是图片，则不操作\n",
    "        if is_XObject or is_Image:\n",
    "            img_count += 1\n",
    "            # 根据索引生成图像\n",
    "            pix = fitz.Pixmap(pdf_info, index)\n",
    "            pic_filepath = os.path.join(pic_dirpath, 'img_' + str(img_count) + '.png')\n",
    "            \"\"\"pix.size 可以反映像素多少，简单的色素块该值较低，可以通过设置一个阈值过滤。以阈值 10000 为例过滤\"\"\"\n",
    "            # if pix.size < 10000:\n",
    "            #     continue\n",
    "\n",
    "            \"\"\"三、 将图像存为png格式\"\"\"\n",
    "            if pix.n >= 5:\n",
    "                # 先转换CMYK\n",
    "                pix = fitz.Pixmap(fitz.csRGB, pix)\n",
    "            # 存为PNG\n",
    "            pix.writePNG(pic_filepath)\n",
    "            \n",
    "filename = '易方达中小盘混合型证券投资基金2020年中期报告.pdf'\n",
    "filepath = os.path.join(os.getcwd(), filename)\n",
    "pic_dirpath = os.path.join(os.getcwd(), '易方达中小盘混合型证券投资基金2020年中期报告【文中图片】')\n",
    "# 提取图片内容\n",
    "extract_pic_info(filepath, pic_dirpath)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### **转换为图片**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from pdf2image import convert_from_path, convert_from_bytes\n",
    "\n",
    "def convert_to_pic(filepath, pic_dirpath):\n",
    "    \"\"\"\n",
    "    每一页的PDF转换成图片\n",
    "    @param filepath:pdf文件路径\n",
    "    @param pic_dirpath:图片目录路径\n",
    "    @return:\n",
    "    \"\"\"\n",
    "    print(filepath)\n",
    "    if not os.path.exists(pic_dirpath):\n",
    "        os.makedirs(pic_dirpath)\n",
    "\n",
    "    images = convert_from_bytes(open(filepath, 'rb').read())\n",
    "    # images = convert_from_path(filepath, dpi=200)\n",
    "    for image in images:\n",
    "        # 保存图片\n",
    "        pic_filepath = os.path.join(pic_dirpath, 'img_'+str(images.index(image))+'.png')\n",
    "        image.save(pic_filepath, 'PNG')\n",
    "        \n",
    "# PDF转换为图片\n",
    "convert_to_pic(filepath, pic_dirpath)\n",
    "\n",
    "\n",
    "filename = '易方达中小盘混合型证券投资基金2020年中期报告.pdf'\n",
    "filepath = os.path.join(os.getcwd(), filename)\n",
    "pic_dirpath = os.path.join(os.getcwd(), '易方达中小盘混合型证券投资基金2020年中期报告【转换为图片】')\n",
    "# PDF转换为图片\n",
    "convert_to_pic(filepath, pic_dirpath)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### **添加水印**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from copy import copy\n",
    "from PyPDF2 import PdfFileReader, PdfFileWriter\n",
    "\n",
    "def add_watermark(filepath, save_filepath, watermark_filepath):\n",
    "    \"\"\"\n",
    "    添加水印\n",
    "    @param filepath:PDF文件路径\n",
    "    @param save_filepath:最终的文件保存路径\n",
    "    @param watermark_filepath:水印PDF文件路径\n",
    "    @return:\n",
    "    \"\"\"\n",
    "    \"\"\"读取PDF水印文件\"\"\"\n",
    "    # 可以先生成一个空白A4大小的png图片，通过 https://mp.weixin.qq.com/s/_oJA6lbsdMlRRsBf6DPxsg 教程的方式给图片加水印，将图片插入到word中并最终生成一个水印PDF文档\n",
    "    watermark = PdfFileReader(watermark_filepath)\n",
    "    watermark_page = watermark.getPage(0)\n",
    "\n",
    "    pdf_reader = PdfFileReader(filepath)\n",
    "    pdf_writer = PdfFileWriter()\n",
    "\n",
    "    for page_index in range(pdf_reader.getNumPages()):\n",
    "        current_page = pdf_reader.getPage(page_index)\n",
    "        # 封面页不添加水印\n",
    "        if page_index == 0:\n",
    "            new_page = current_page\n",
    "        else:\n",
    "            new_page = copy(watermark_page)\n",
    "            new_page.mergePage(current_page)\n",
    "        pdf_writer.addPage(new_page)\n",
    "    # 保存水印后的文件\n",
    "    with open(save_filepath, \"wb\") as out:\n",
    "        pdf_writer.write(out)\n",
    "\n",
    "filename = '易方达中小盘混合型证券投资基金2020年中期报告.pdf'\n",
    "filepath = os.path.join(os.getcwd(), filename)\n",
    "save_filepath = os.path.join(os.getcwd(), '易方达中小盘混合型证券投资基金2020年中期报告-水印.pdf')\n",
    "watermark_filepath = os.path.join(os.getcwd(), 'watermark.pdf')\n",
    "# 添加水印\n",
    "add_watermark(filepath, save_filepath, watermark_filepath)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### **文档加密与解密**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from PyPDF2 import PdfFileReader, PdfFileWriter\n",
    "\n",
    "def encrypt_pdf(filepath, save_filepath, passwd='xiaoyi'):\n",
    "    \"\"\"\n",
    "    PDF文档加密\n",
    "    @param filepath:PDF文件路径\n",
    "    @param save_filepath:加密后的文件保存路径\n",
    "    @param passwd:密码\n",
    "    @return:\n",
    "    \"\"\"\n",
    "    pdf_reader = PdfFileReader(filepath)\n",
    "    pdf_writer = PdfFileWriter()\n",
    "\n",
    "    for page_index in range(pdf_reader.getNumPages()):\n",
    "        pdf_writer.addPage(pdf_reader.getPage(page_index))\n",
    "\n",
    "    # 添加密码\n",
    "    pdf_writer.encrypt(passwd)\n",
    "    with open(save_filepath, \"wb\") as out:\n",
    "        pdf_writer.write(out)\n",
    "\n",
    "filename = '易方达中小盘混合型证券投资基金2020年中期报告.pdf'\n",
    "filepath = os.path.join(os.getcwd(), filename)\n",
    "save_filepath = os.path.join(os.getcwd(), '易方达中小盘混合型证券投资基金2020年中期报告-加密后.pdf')\n",
    "# 文档加密\n",
    "encrypt_pdf(filepath, save_filepath, passwd='xiaoyi')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def decrypt_pdf(filepath, save_filepath, passwd='xiaoyi'):\n",
    "    \"\"\"\n",
    "    解密 PDF 文档并且保存为未加密的 PDF\n",
    "    @param filepath:PDF文件路径\n",
    "    @param save_filepath:解密后的文件保存路径\n",
    "    @param passwd:密码\n",
    "    @return:\n",
    "    \"\"\"\n",
    "    pdf_reader = PdfFileReader(filepath)\n",
    "    # PDF文档解密\n",
    "    pdf_reader.decrypt('xiaoyi')\n",
    "\n",
    "    pdf_writer = PdfFileWriter()\n",
    "    for page_index in range(pdf_reader.getNumPages()):\n",
    "        pdf_writer.addPage(pdf_reader.getPage(page_index))\n",
    "\n",
    "    with open(save_filepath, \"wb\") as out:\n",
    "        pdf_writer.write(out)\n",
    "\n",
    "filename = '易方达中小盘混合型证券投资基金2020年中期报告-加密后.pdf'\n",
    "filepath = os.path.join(os.getcwd(), filename)\n",
    "save_filepath = os.path.join(os.getcwd(), '易方达中小盘混合型证券投资基金2020年中期报告-解密后.pdf')\n",
    "# 文档解密\n",
    "decrypt_pdf(filepath, save_filepath, passwd='xiaoyi')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### **页面旋转**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import PyPDF2\n",
    "\n",
    "filename = '易方达中小盘混合型证券投资基金2020年中期报告.pdf'\n",
    "filepath = os.path.join(os.getcwd(), filename)\n",
    "save_filepath = os.path.join(os.getcwd(), '易方达中小盘混合型证券投资基金2020年中期报告-旋转.pdf')\n",
    "pdf_reader = PdfFileReader(filepath)\n",
    "page = pdf_reader.getPage(0)\n",
    "page.rotateClockwise(90)\n",
    "pdf_writer = PdfFileWriter()\n",
    "pdf_writer.addPage(page)\n",
    "with open(save_filepath, \"wb\") as out:\n",
    "        pdf_writer.write(out)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
